# | Project | Analysis of Chicago Real Estate Market |
# |---|---|
# | Subject | Machine Learning |
# | Assignment | Final Assignment |
# | Team Name | Group 4 |
# | Team Members | Mary Kate Krusenoski, Yunhao Dou, Phill Betts, Zijun Wu |
import numpy as np
import pandas as pd
import pickle
import datetime as dt
import missingno as msno
import sklearn.model_selection
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
import math
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
%matplotlib inline
from sklearn import preprocessing
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_squared_error,mean_absolute_error
from sklearn.linear_model import Ridge
# suppressing pesky warnings
import warnings
warnings.filterwarnings("ignore")
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=Warning)
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from wordcloud import WordCloud
from autoimpute.imputations import SingleImputer, MultipleImputer, MiceImputer
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
import datetime
import folium
from folium.plugins import FastMarkerCluster
from folium.plugins import HeatMap
import json
# Read the file containing the data from redfin (sold listings, one row per sale)
df = pd.read_excel ('Sold_05222022.xlsx')
#Chicago Crime 2021 (incident-level extract)
dfCrime = pd.read_csv ('ChiCrime.csv')
df.head()
# drop the columns that are not needed for building the model
# (STATUS and STATE OR PROVINCE appear twice in this list; pandas drops them once)
df=df.drop(['Source.Name','SALE TYPE','STATE OR PROVINCE','STATUS',
'NEXT OPEN HOUSE START TIME',
'NEXT OPEN HOUSE END TIME',
'FAVORITE',
'INTERESTED',
'MLS#',
'STATUS',
'STATE OR PROVINCE',
'URL (SEE http://www.redfin.com/buy-a-home/comparative-market-analysis FOR INFO ON PRICING)',#has missing data
'SOURCE',
'$/SQUARE FEET'
], axis=1)
#keep address for identification later
# remove special character as well as spaces in between column names
# removing them helps later in processing
df.columns = df.columns.str.replace(' ', '')
#For testing imputation methods
# df1 keeps a pristine copy of the pruned frame; df is immediately restored
# from it, so both frames start identical.
df1 = df.copy()
df = df1.copy()
# Inspect the crime extract (row count, dtypes, non-null counts).
dfCrime.info()
#october 31 2020 - 2021 — assumed date coverage of the extract; TODO confirm
dfCrime.head()
#Converting Community Area from numerical data to neighborhood to merge with Redfin LOCATION data
# Cast to string first so the float codes (e.g. 1.0) become '1.0' dict keys.
dfCrime["Community Area"] = dfCrime["Community Area"].astype(str)
# Chicago's 77 official community areas in numeric order (1..77). The lookup
# keys mirror the stringified float codes produced by the astype(str) cast
# above, e.g. community area 1 -> '1.0' -> 'Rogers Park'.
_COMMUNITY_AREA_NAMES = [
    'Rogers Park', 'West Ridge', 'Uptown', 'Lincoln Square', 'North Center',
    'Lakeview', 'Lincoln Park', 'Near North Side', 'Edison Park',
    'Norwood Park', 'Jefferson Park', 'Forest Glen', 'North Park',
    'Albany Park', 'Portage Park', 'Irving Park', 'Dunning', 'Montclare',
    'Belmont Cragin', 'Hermosa', 'Avondale', 'Logan Square', 'Humboldt Park',
    'West Town', 'Austin', 'West Garfield Park', 'East Garfield Park',
    'Near West Side', 'North Lawndale', 'South Lawndale', 'Lower West Side',
    'Loop', 'Near South Side', 'Armour Square', 'Douglas', 'Oakland',
    'Fuller Park', 'Grand Boulevard', 'Kenwood', 'Washington Park',
    'Hyde Park', 'Woodlawn', 'South Shore', 'Chatham', 'Avalon Park',
    'South Chicago', 'Burnside', 'Calumet Heights', 'Roseland', 'Pullman',
    'South Deering', 'East Side', 'West Pullman', 'Riverdale', 'Hegewisch',
    'Garfield Ridge', 'Archer Heights', 'Brighton Park', 'McKinley Park',
    'Bridgeport', 'New City', 'West Elsdon', 'Gage Park', 'Clearing',
    'West Lawn', 'Chicago Lawn', 'West Englewood', 'Englewood',
    'Greater Grand Crossing', 'Ashburn', 'Auburn Gresham', 'Beverly',
    'Washington Heights', 'Mount Greenwood', 'Morgan Park', 'O Hare',
    'Edgewater',
]
Communitydict = {
    '{}.0'.format(code): name
    for code, name in enumerate(_COMMUNITY_AREA_NAMES, start=1)
}
# Translate numeric area codes to neighborhood names (unmatched -> NaN).
dfCrime["Community Area"] = dfCrime["Community Area"].map(Communitydict)
#Crime by type
dfCrime['Primary Type'].value_counts()
#We want to aggregate total crimes for the year per Community Area
crimeCountLocation = pd.DataFrame(dfCrime['Community Area'].value_counts().reset_index().values, columns=['Community Area', "AggregateCrime"])
crimeCountindex = crimeCountLocation.sort_index(axis = 0, ascending=True)
crimeCountindex
#Removing Locations with unmappable characters
# Strip the 'CHI - ' prefix so Redfin LOCATION matches the crime-table names.
df['LOCATION']=df['LOCATION'].str.replace('CHI - ','')
# Left join: every sale keeps its row; unmatched areas get NaN AggregateCrime.
df = pd.merge(df, crimeCountindex, how='left', left_on='LOCATION', right_on='Community Area')
df.head()
#Dropping merged location column (join key no longer needed)
df=df.drop(['Community Area'],axis = 1)
#Reviewing all property types. We will only include top 3 in our analysis
df['PROPERTYTYPE'].value_counts()
# converting the sold date field to a proper date time format
df['SOLDDATE'] = pd.to_datetime(df['SOLDDATE'])
# Keep only the sale year; rows with no sale date are dropped.
df['SOLDDATE']=pd.DatetimeIndex(df['SOLDDATE']).year
df.dropna(subset = ['SOLDDATE'], inplace=True)
df['SOLDDATE']=df['SOLDDATE'].astype(int)
# Upper-case city names so they match the CITY mapping defined below.
df['CITY'] = df.CITY.str.upper()
#Removing Arlington Heights (out-of-scope suburb)
df = df[df['CITY'] != 'ARLINGTON HEIGHTS']
df['CITY'].value_counts()
#For listing on Cities — keep a copy with readable city names for plotting
df1 = df.copy()
#Mapping data to numerical values:
# PROPERTYTYPE -> ordinal code per dwelling type; CITY -> 1 for Chicago
# proper, 0 for any surrounding suburb; SOLDDATE -> years 2016..2022
# mapped to ordinals 1..7.
_SUBURBS = [
    'OAK PARK', 'OAK LAWN', 'BERWYN', 'BURBANK', 'EVERGREEN PARK',
    'ELMWOOD PARK', 'FOREST PARK', 'CICERO', 'RIVER FOREST', 'HOMETOWN',
    'STICKNEY', 'LINCOLNWOOD', 'HARWOOD HEIGHTS', 'BRIDGEVIEW', 'SKOKIE',
    'NORRIDGE', 'RIVERSIDE', 'NORTH RIVERSIDE', 'RIVER GROVE', 'BURNHAM',
    'FOREST VIEW', 'RIVERDALE', 'CALUMET PARK', 'MELROSE PARK',
    'NOTTINGHAM PARK', 'LAWNDALE', 'MAYWOOD', 'NILES',
]
mapping_dict = {
    "PROPERTYTYPE": {
        "Condo/Co-op": 1,
        "Single Family Residential": 2,
        "Townhouse": 3,
        "Multi-Family (2-4 Unit)": 4,
        "Multi-Family (5+ Unit)": 5,
        "Other": 6,
        "Vacant Land": 7,
        "Parking": 8,
    },
    "CITY": dict({'CHICAGO': 1}, **dict.fromkeys(_SUBURBS, 0)),
    "SOLDDATE": {year: year - 2015 for year in range(2016, 2023)},
}
# Apply the ordinal/categorical mappings defined above.
df = df.replace(mapping_dict)
#changing numerical datatypes (stringify before label encoding)
df['PROPERTYTYPE'] = df['PROPERTYTYPE'].astype('str')
df['CITY'] = df['CITY'].astype('str')
df['LOCATION'] = df['LOCATION'].astype('str')
df['SOLDDATE'] = df['SOLDDATE'].astype('int')
# Label-encode each categorical column. The same encoder instance is re-fit
# per column; resulting codes are 0-based, ordered lexicographically over
# the stringified values.
catmap = preprocessing.LabelEncoder()
df['PROPERTYTYPE'] = catmap.fit_transform(df['PROPERTYTYPE'].values)
df['LOCATION'] = catmap.fit_transform(df['LOCATION'].values)
df['CITY'] = catmap.fit_transform(df['CITY'].values)
df['ZIPORPOSTALCODE'] = catmap.fit_transform(df['ZIPORPOSTALCODE'].values)
df['YEARBUILT'] = catmap.fit_transform(df['YEARBUILT'].values)
df.info()
#Removing Multi-Family, Vacant lots, or Other property types
# NOTE(review): after label encoding the codes are 0-based, so '< 4' keeps
# FOUR encoded categories (likely including Multi-Family 2-4 Unit), not the
# top 3 named above — verify the intended cutoff.
df = df[df['PROPERTYTYPE'] < 4]
df.shape
df.head()
# Since the data collection was done by multiple team members,
# there is a chance there are duplicates
df.drop_duplicates(keep='first',inplace=True)
df1.drop_duplicates(keep='first',inplace=True)
# Bar chart: top five cities by listing count (uses the un-encoded df1).
fig = plt.figure()
fig, ax = plt.subplots()
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.spines['left'].set_visible(True)
ax.spines['bottom'].set_visible(False)
feq=df1['CITY'].value_counts().nlargest(5).sort_values(ascending=True)
feq.plot.barh(figsize=(10, 10), color="red", width=0.9)
plt.title("Top 5 Cities (Listings)", fontsize=20)
# Annotate each bar with its count.
for i, v in enumerate(feq):
    ax.text(v + 75, i - 0.1, str(v), color="black", fontweight='bold')
ax.set_xticks([])
plt.show()
#fig.savefig('./Photos/Top5.png', transparent=True)
df.shape
#Drop Blank Latitudes that account for a large missing portion of the data, including price
df.dropna(subset = ['LATITUDE'], inplace=True)
# Missing-value summary: absolute count and percentage per column.
total = df.isnull().sum().sort_values(ascending=False)
percent_1 = df.isnull().sum()/df.isnull().count()*100
percent_2 = (round(percent_1, 1)).sort_values(ascending=False)
missing_data = pd.concat([total, percent_2], axis=1, keys=['Total', '%'])
missing_data.head(33)
# Column means grouped by the encoded sale-year code.
df.groupby(['SOLDDATE']).mean()
#median price over time
table = pd.pivot_table(df, values='PRICE',columns=['SOLDDATE'], aggfunc=np.median)
table
# correlation heat map setup for seaborn
def corr_chart(df_corr):
    """Draw a lower-triangle correlation heat map of the numeric columns of
    ``df_corr`` and save it to 'plot-corr-map.pdf'.
    """
    corr = df_corr.corr()
    #screen top half to get a triangle (mask the upper triangle so each
    #variable pair is shown once)
    # FIX: np.bool was deprecated in NumPy 1.20 and removed in 1.24; the
    # builtin bool is the documented replacement.
    top = np.zeros_like(corr, dtype=bool)
    top[np.triu_indices_from(top)] = True
    fig, ax = plt.subplots(figsize=(12, 12))
    sns.heatmap(corr, mask=top, cmap='coolwarm',
                center=0, square=True,
                linewidths=.5, cbar_kws={'shrink': .5},
                annot=True, annot_kws={'size': 9}, fmt='.3f')
    plt.xticks(rotation=45)  # rotate variable labels on columns (x axis)
    plt.yticks(rotation=0)   # use horizontal variable labels on rows (y axis)
    plt.title('Correlation Heat Map')
    # FIX: the 'papertype' and 'frameon' keyword arguments were deprecated in
    # Matplotlib 3.3 and removed in 3.6; savefig now raises on them.
    plt.savefig('plot-corr-map.pdf',
                bbox_inches='tight', dpi=None, facecolor='w', edgecolor='b',
                orientation='portrait', format=None,
                transparent=True, pad_inches=0.25)
# Limit printed float precision for readability.
np.set_printoptions(precision=3)
# with correlation matrix/heat map
corr_chart(df_corr = df)
#Imputing SQ Feet based on KNN nearest neighbors
import sklearn.impute
from sklearn.impute import SimpleImputer, KNNImputer
# NOTE(review): the imputer is fit on the single SQUAREFEET column only, so
# neighbors are found by that same feature — confirm a multi-feature fit
# wasn't intended.
knniputer = KNNImputer()
knniputer.fit(df[["SQUAREFEET"]])
df["SQUAREFEET"] = knniputer.transform(df[["SQUAREFEET"]])
df["SQUAREFEET"] = df["SQUAREFEET"].apply(lambda x: int(x))
df["SQUAREFEET"].unique()
#impute HOA/ Month with zero.
#Assuming those without HOA's are likely houses without HOA's
df['HOA/MONTH'] = df['HOA/MONTH'].fillna(0)
#Dropping Days on Market and Lot Size since there is no correlation nor impact
df=df.drop(['DAYSONMARKET', 'LOTSIZE'],axis=1)
#impute aggregate crime with the median
df['AggregateCrime']=df['AggregateCrime'].fillna(df['AggregateCrime'].median())
# Re-check missingness after imputation.
total = df.isnull().sum().sort_values(ascending=False)
percent_1 = df.isnull().sum()/df.isnull().count()*100
percent_2 = (round(percent_1, 1)).sort_values(ascending=False)
missing_data = pd.concat([total, percent_2], axis=1, keys=['Total', '%'])
missing_data.head(33)
# Drop any rows that still contain NaNs after the imputations above.
df.dropna(inplace=True)
#Longitude has error '0' in data and two large homes with 10000 square feet
df = df[df['LONGITUDE'] != 0]
df = df[df['SQUAREFEET'] < 9999]
#post clean data this is the final size of our dataset
df.shape
# Quick visualization to see obvious gaps in the data.
# Heat map of listing density over the Chicago area.
m=folium.Map([41.85, -87.68],zoom_start=11)
HeatMap(df[['LATITUDE','LONGITUDE']].dropna(),radius=8,gradient={0.25:'blue',0.5:'purple',0.75:'orange',1.0:'red'}).add_to(m)
display(m)
# Removing zip codes related to Arlington heights
# NOTE(review): ZIPORPOSTALCODE and CITY were label-encoded to integers
# above, so neither the isin([...]) on raw zip codes nor the comparison to
# the string 'Arlington Heights' below can match anything — verify intent.
df=df[~df.ZIPORPOSTALCODE.isin([60004, 60005, 60006, 60008, 60056, 60095])]
df[df['CITY'] == 'Arlington Heights']['ZIPORPOSTALCODE'].value_counts()
m=folium.Map([41.85, -87.68],zoom_start=11)
HeatMap(df[['LATITUDE','LONGITUDE']].dropna(),radius=8,gradient={0.25:'blue',0.5:'purple',0.75:'orange',1.0:'red'}).add_to(m)
display(m)
# Clustered marker map of every remaining listing.
lats2018 = df['LATITUDE'].tolist()
lons2018 = df['LONGITUDE'].tolist()
locations = list(zip(lats2018, lons2018))
map1 = folium.Map(location=[41.85, -87.68], zoom_start=11.5)
FastMarkerCluster(data=locations).add_to(map1)
map1
df.info()
df.describe()
# Eyeball the size/price relationship for obvious data errors.
df.plot(kind="scatter", x="SQUAREFEET", y="PRICE")
df[df['SQUAREFEET']<600]['SQUAREFEET'].value_counts().sort_index()
#We want to remove houses listed with less than 176 as we can assume these are errors
df = df[df['SQUAREFEET']>=176]
# Distribution of bedroom counts.
plt.figure(figsize=(6,6))
sns.countplot(df['BEDS'])
plt.xlabel('Beds')
plt.ylabel('Count')
df.BEDS.value_counts().sort_index(ascending=True)
# Distribution of bathroom counts.
plt.figure(figsize=(6,6))
sns.countplot(df['BATHS'])
plt.xlabel('Baths')
plt.ylabel('Count')
df.BATHS.value_counts().sort_index(ascending=True)
#These are likely skewed data — drop implausible bath counts
df = df[df['BATHS'] <14]
# Bar chart of price counts bucketed into coarse USD ranges.
filter_values = [ 50000, 500000,1000000,5000000 ]
fig = plt.figure()
fig, ax = plt.subplots()
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.spines['left'].set_visible(True)
ax.spines['bottom'].set_visible(False)
feq=df['PRICE'].value_counts(bins=filter_values).sort_values(ascending=True)
feq.plot.barh(figsize=(10,6), color="red", width=0.5)
plt.title("Price Range (in USD)", fontsize=20)
# Annotate each bar with its count.
for i, v in enumerate(feq):
    ax.text(v + 10, i - 0.1, str(v), color="black", fontweight='bold')
plt.tight_layout()
plt.show()
# This is a common function to calculate the Interquartile range
def outlier_lower_iqr(col_name, th1=0.25, th3=0.75, data=None):
    """Return the lower Tukey fence (Q1 - 1.5*IQR) for ``col_name``.

    Parameters
    ----------
    col_name : str
        Column to analyse.
    th1, th3 : float
        Quantiles used for Q1 and Q3.
    data : pandas.DataFrame, optional
        Frame to analyse. Defaults to the module-level ``df`` so existing
        call sites keep working; passing it explicitly removes the hidden
        global dependency.
    """
    frame = df if data is None else data
    quartile1 = frame[col_name].quantile(th1)
    quartile3 = frame[col_name].quantile(th3)
    iqr = quartile3 - quartile1
    lower_limit = quartile1 - 1.5 * iqr
    return lower_limit
def outlier_upper_iqr(col_name, th1=0.25, th3=0.75, data=None):
    """Return the upper Tukey fence (Q3 + 1.5*IQR) for ``col_name``.

    Parameters
    ----------
    col_name : str
        Column to analyse.
    th1, th3 : float
        Quantiles used for Q1 and Q3.
    data : pandas.DataFrame, optional
        Frame to analyse. Defaults to the module-level ``df`` so existing
        call sites keep working; passing it explicitly removes the hidden
        global dependency.
    """
    frame = df if data is None else data
    quartile1 = frame[col_name].quantile(th1)
    quartile3 = frame[col_name].quantile(th3)
    iqr = quartile3 - quartile1
    upper_limit = quartile3 + 1.5 * iqr
    return upper_limit
#Remove Price Outliers
# Keep only rows whose PRICE lies within the Tukey fences.
num_list=[ "PRICE"]
for i in num_list:
    df=df[df[i].between(outlier_lower_iqr(i), outlier_upper_iqr(i))]
# Price distribution after outlier removal.
fig = plt.figure()
ax = sns.distplot(df['PRICE'], color="red")
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.spines['bottom'].set_visible(True)
ax.spines['left'].set_visible(True)
plt.title("Price", fontsize=20)
ax.set_xticks([])
ax.set_yticks([])
plt.show()
print("Skewness of PRICE: {}".format(df['PRICE'].skew()))
# Compare skewness of log-transformed price on a throwaway copy.
df1 =df.copy()
df1['PRICE'] = df1['PRICE'].apply(lambda x: math.log(x))
print("Skewness of PRICE: {}".format(df1['PRICE'].skew()))
#Data is more skewed when logged, will keep as is
# Fixed seed for reproducible model fits.
RANDOM_SEED = 50
# splitting data into test and train based on SOLDDATE(YEAR)
X = df.loc[:, ['SOLDDATE', 'PROPERTYTYPE', 'CITY', 'ZIPORPOSTALCODE', 'BEDS', 'BATHS', 'LOCATION', 'SQUAREFEET', 'YEARBUILT',
'HOA/MONTH', 'LATITUDE', 'LONGITUDE', 'AggregateCrime','PRICE']]
#Pre-covid prices should offset price increase in 2021/22 and show us pricing more in-line with
#what we would have expected before covid and inflation
# Train on sale-year codes 1-5 (2016-2020); test on codes 6-7 (2021-2022).
train_data=X[X['SOLDDATE']<6]
test_data=X[X['SOLDDATE']>=6]
X_train = train_data.drop(['PRICE'], axis = 1)
X_test = test_data.drop(['PRICE'], axis = 1)
y_train = train_data["PRICE"]
y_test = test_data["PRICE"]
len(X_train), len(y_train), len(X_test), len(y_test), len(train_data)
#Creating RMSE Scoring Function
def rmse(predict, actual):
    """Root-mean-squared error between two equal-length sequences.

    Both arguments may be any array-like; they are converted to NumPy
    arrays before the element-wise difference is taken.
    """
    residuals = np.array(predict) - np.array(actual)
    return np.sqrt((residuals ** 2).mean())
# Negated-RMSE scorer for GridSearchCV (sklearn maximizes scores, so
# greater_is_better=False makes reported values negative).
rmse_score = make_scorer(rmse, greater_is_better = False)
from sklearn.model_selection import cross_val_score
# function to get cross validation scores
def get_cv_scores(model):
    """Print mean and std of 5-fold cross-validated R^2 for ``model``.

    Reads the module-level X_train / y_train.
    """
    scores = cross_val_score(model,
                             X_train,
                             y_train,
                             cv=5,
                             scoring='r2')
    print('CV Mean: ', np.mean(scores))
    print('STD: ', np.std(scores))
    print('\n')
# Re-imports (harmless duplicates from the notebook's original cell layout).
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn import metrics
from sklearn.linear_model import LinearRegression
# Train model
# Baseline: ordinary least-squares linear regression.
lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)
# get cross val scores
get_cv_scores(lin_reg)
# The cross-validation score is low for linear regression; try to improve it using Ridge, Lasso, and Elastic Net.
# Evaluate the linear model on the held-out 2021/22 sales.
prediction_linear = lin_reg.predict(X_test)
prediction_linear
linear_mse = metrics.mean_squared_error(y_test, prediction_linear)
linear_mse
linear_rmse = np.sqrt(linear_mse)
linear_rmse
# R^2 on the training data.
lin_reg.score(X_train, y_train)
rmse(y_test, prediction_linear)
from sklearn.linear_model import Ridge
# Train model with default alpha=1
ridge = Ridge(alpha=1).fit(X_train, y_train)
# get cross val scores
get_cv_scores(ridge)
# The CV mean is still low.
# find optimal alpha with grid search
alpha = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
param_grid = dict(alpha=alpha)
# Scores are negated RMSE (see rmse_score), so "best" is the least negative.
grid = GridSearchCV(estimator=ridge, param_grid=param_grid, scoring=rmse_score, verbose=1, n_jobs=-1)
grid_result = grid.fit(X_train, y_train)
print('Best Score: ', grid_result.best_score_)
print('Best Params: ', grid_result.best_params_)
# Even with the best parameters, the RMSE score is largely negative.
# Training R^2 and held-out RMSE for the (alpha=1) ridge model.
ridge.score(X_train, y_train)
y_pred_rd = ridge.predict(X_test)
rmse(y_test, y_pred_rd)
from sklearn.linear_model import Lasso
# Train model with default alpha=1
lasso = Lasso(alpha=1).fit(X_train, y_train)
# get cross val scores
get_cv_scores(lasso)
# The CV mean is similar to linear regression and ridge regression.
# find optimal alpha with grid search
alpha = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
param_grid = dict(alpha=alpha)
# Scores are negated RMSE (see rmse_score), so "best" is the least negative.
grid = GridSearchCV(estimator=lasso, param_grid=param_grid, scoring=rmse_score, verbose=1, n_jobs=-1)
grid_result = grid.fit(X_train, y_train)
print('Best Score: ', grid_result.best_score_)
print('Best Params: ', grid_result.best_params_)
# Even with the best parameters, the RMSE score is largely negative.
# Training R^2 and held-out RMSE for the (alpha=1) lasso model.
lasso.score(X_train, y_train)
y_pred_ls = lasso.predict(X_test)
rmse(y_test, y_pred_ls)
from sklearn.linear_model import ElasticNet
# Train model with default alpha=1 and l1_ratio=0.5
elastic_net = ElasticNet(alpha=1, l1_ratio=0.5).fit(X_train, y_train)
# get cross val scores
get_cv_scores(elastic_net)
# The CV mean is even lower for elastic net than for linear regression and ridge regression.
# find optimal alpha with grid search (alpha and L1/L2 mixing ratio)
alpha = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
l1_ratio = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]
param_grid = dict(alpha=alpha, l1_ratio=l1_ratio)
grid = GridSearchCV(estimator=elastic_net, param_grid=param_grid, scoring=rmse_score, verbose=1, n_jobs=-1)
grid_result = grid.fit(X_train, y_train)
print('Best Score: ', grid_result.best_score_)
print('Best Params: ', grid_result.best_params_)
# Training R^2 and held-out RMSE for the (default) elastic-net model.
elastic_net.score(X_train, y_train)
y_pred_en = elastic_net.predict(X_test)
rmse(y_test, y_pred_en)
# The RMSE score is not optimal.
# Decision-tree hyperparameter grid.
# NOTE(review): the criterion names 'mse'/'mae' were renamed to
# 'squared_error'/'absolute_error' in scikit-learn 1.0 and removed in 1.2 —
# update these strings before running on a recent sklearn.
criterion = ["mse", "mae"]
min_samples_split = [2, 5, 10, 20]
min_samples_leaf = [1, 20, 40, 100]
param_grid = {"criterion": criterion,
              "min_samples_split": min_samples_split,
              "min_samples_leaf": min_samples_leaf,
              }
# Use GridSearchCV to search for best hyperparameters
# First create the base model to tune
decision_tree = DecisionTreeRegressor(random_state=RANDOM_SEED)
# Using 3 fold cross validation
grid_search = GridSearchCV(decision_tree, param_grid = param_grid, cv=3, scoring=rmse_score, n_jobs = -1)
grid_search.fit(X_train, y_train)
final = grid_search.best_params_
print(final)
# Refit a tree with the hand-copied best parameters from the search above.
dt = DecisionTreeRegressor(criterion= 'mse', min_samples_leaf= 40, min_samples_split= 2,random_state=RANDOM_SEED)
dt.fit(X_train,y_train)
y_pred_dt = dt.predict(X_test)
print('RMSE Score:',rmse(y_test, y_pred_dt))
dt.score(X_train, y_train)
get_cv_scores(dt)
# Horizontal bar chart of decision-tree feature importances (ascending).
features = X_train.columns
importances = dt.feature_importances_
indices = np.argsort(importances)
fig = plt.figure(figsize=(25,25))
fig, ax = plt.subplots()
#plt.figure( figsize=(20,20) )
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.spines['left'].set_visible(True)
ax.spines['bottom'].set_visible(False)
plt.barh(range(len(indices)), importances[indices], align='center', color="red")
plt.yticks(range(len(indices)), [features[i] for i in indices]);
plt.xlabel('Relative Importance')
plt.tight_layout()
plt.show()
# splitting data into test and train based on SOLDDATE(YEAR)
X = df.loc[:, ['SOLDDATE', 'PROPERTYTYPE', 'CITY', 'ZIPORPOSTALCODE', 'BEDS', 'BATHS', 'LOCATION', 'SQUAREFEET', 'YEARBUILT',
'HOA/MONTH', 'LATITUDE', 'LONGITUDE', 'AggregateCrime','PRICE']]
#Pre-covid prices should offset price increase in 2021/22 and show us pricing more in-line with
#what we would have expected before covid and inflation
# FIX: the original filtered the training set with SOLDDATE < 7, which left
# the 2021 sales (code 6) in BOTH train and test — leaking test rows into
# training. Use the same boundary as the earlier split: train on codes 1-5
# (2016-2020), test on codes 6-7 (2021-2022).
train_data=X[X['SOLDDATE']<6]
test_data=X[X['SOLDDATE']>=6]
X_train = train_data.drop(['PRICE'], axis = 1)
X_test = test_data.drop(['PRICE'], axis = 1)
y_train = train_data["PRICE"]
y_test = test_data["PRICE"]
# Create the parameter grid based on the results of random search
param_grid = {
    'bootstrap': [True],
    'max_depth': [80, 90, 100, 110],
    'max_features': [2, 3],
    'min_samples_leaf': [3, 4, 5],
    'min_samples_split': [8, 10, 12],
    'n_estimators': [100, 200, 300, 1000]
}
# Use GridSearchCV to search for best hyperparameters
# First create the base model to tune
random_forest = RandomForestRegressor(random_state=RANDOM_SEED)
# Using 3 fold cross validation
# NOTE(review): this search uses the estimator's default R^2 scoring,
# unlike the earlier searches which passed the RMSE scorer — confirm intent.
rf_grid = GridSearchCV(estimator = random_forest, param_grid = param_grid,
                       cv = 3, n_jobs = -1, verbose = 2)
# Fit the random search model
rf_grid.fit(X_train, y_train)
rf_grid.best_params_
# Refit a random forest with the hand-copied best parameters from the search.
rf = RandomForestRegressor(n_estimators= 300,
                           min_samples_split=8,
                           min_samples_leaf = 3,
                           max_depth = 80,
                           max_features = 3,
                           bootstrap = True,
                           random_state = RANDOM_SEED)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)
print('RMSE Score:',rmse(y_test, y_pred_rf))
rf.score(X_train, y_train)
get_cv_scores(rf)
# Load the active (for-sale) listings for scoring with the trained model.
dfp = pd.read_excel ('ForSale05222022.xlsx')
dfp.info()
dfp.drop_duplicates(keep='first',inplace=True)
# Same column pruning as for the sold data (this export's URL column uses
# https, unlike the sold file's http).
dfp=dfp.drop(['Source.Name','SALE TYPE','STATE OR PROVINCE','STATUS',
'NEXT OPEN HOUSE START TIME',
'NEXT OPEN HOUSE END TIME',
'FAVORITE',
'INTERESTED',
'MLS#',
'STATUS',
'STATE OR PROVINCE',
'URL (SEE https://www.redfin.com/buy-a-home/comparative-market-analysis FOR INFO ON PRICING)',#has missing data
'SOURCE',
'$/SQUARE FEET'
], axis=1)
# remove special character as well as spaces in between column names
# removing them helps later in processing
dfp.columns = dfp.columns.str.replace(' ', '')
# Attach per-area crime counts, then drop the join key.
# NOTE(review): unlike df, dfp's LOCATION is not stripped of the 'CHI - '
# prefix before this merge, so those rows will miss crime data.
dfp = pd.merge(dfp, crimeCountindex, how='left', left_on='LOCATION', right_on='Community Area')
dfp.head()
dfp=dfp.drop(['DAYSONMARKET','LOTSIZE','Community Area'],axis=1)
dfp['AggregateCrime']=dfp['AggregateCrime'].fillna(dfp['AggregateCrime'].median())
# FIX: the original repeated the SOLDDATE datetime-conversion cell here, but
# on `df` (a copy/paste slip). At this point df['SOLDDATE'] already holds
# the ordinal codes 1..7, so pd.to_datetime() reinterpreted those integers
# as nanosecond timestamps and overwrote every value with the year 1970,
# corrupting df for the later price maps. The redundant conversion is
# removed; dfp has no sold date and is assigned the constant code 7 below.
dfp['CITY'] = dfp.CITY.str.upper()
dfp = dfp[dfp['CITY'] != 'ARLINGTON HEIGHTS']
#Mapping data to numerical values — the same mapping used for the training
# data, applied here to the for-sale frame dfp.
mapping_dict = {"PROPERTYTYPE":{
"Condo/Co-op": 1,
"Single Family Residential": 2,
"Townhouse": 3,
"Multi-Family (2-4 Unit)": 4,
"Multi-Family (5+ Unit)": 5,
"Other": 6,
"Vacant Land": 7,
"Parking":8
},
"CITY":{
'CHICAGO' : 1,
'OAK PARK' : 0,
'OAK LAWN' : 0,
'BERWYN' : 0,
'BURBANK' : 0,
'EVERGREEN PARK' : 0,
'ELMWOOD PARK' : 0,
'FOREST PARK' : 0,
'CICERO' : 0,
'RIVER FOREST' : 0,
'HOMETOWN' : 0,
'STICKNEY' : 0,
'LINCOLNWOOD' : 0,
'HARWOOD HEIGHTS' : 0,
'BRIDGEVIEW' : 0,
'SKOKIE' : 0,
'NORRIDGE' : 0,
'RIVERSIDE' : 0,
'NORTH RIVERSIDE' : 0,
'RIVER GROVE' : 0,
'BURNHAM' : 0,
'FOREST VIEW' : 0,
'RIVERDALE' : 0,
'CALUMET PARK' : 0,
'MELROSE PARK' : 0,
'NOTTINGHAM PARK' : 0,
'LAWNDALE' : 0,
'MAYWOOD' : 0,
'NILES' : 0},
"SOLDDATE":{
2016: 1,
2017: 2,
2018: 3,
2019: 4,
2020: 5,
2021: 6,
2022: 7
}
}
# FIX: the original applied the mapping to `df` (whose columns were already
# encoded, so the call was a no-op there) instead of the prediction frame
# `dfp`, meaning dfp's PROPERTYTYPE and CITY were never mapped before the
# label encoding below.
dfp = dfp.replace(mapping_dict)
# Encode dfp's categoricals for scoring.
# NOTE(review): LabelEncoder is re-fit on dfp's own values, so these codes
# are not guaranteed to align with the codes the model was trained on —
# verify against the df encoding.
dfp['PROPERTYTYPE'] = dfp['PROPERTYTYPE'].astype('str')
dfp['CITY'] = dfp['CITY'].astype('str')
dfp['LOCATION'] = dfp['LOCATION'].astype('str')
# Every for-sale listing is treated as (hypothetically) selling in 2022,
# i.e. sale-year code 7.
dfp['SOLDDATE'] = 7
dfp['SOLDDATE']=dfp['SOLDDATE'].astype(int)
catmap = preprocessing.LabelEncoder()
dfp['PROPERTYTYPE'] = catmap.fit_transform(dfp['PROPERTYTYPE'].values)
dfp['CITY'] = catmap.fit_transform(dfp['CITY'].values)
dfp['LOCATION'] = catmap.fit_transform(dfp['LOCATION'].values)
dfp['ZIPORPOSTALCODE'] = catmap.fit_transform(dfp['ZIPORPOSTALCODE'].values)
dfp['YEARBUILT'] = catmap.fit_transform(dfp['YEARBUILT'].values)
from sklearn.impute import SimpleImputer, KNNImputer
# Impute missing square footage and HOA the same way as for the sold data.
knniputer = KNNImputer()
knniputer.fit(dfp[["SQUAREFEET"]])
dfp["SQUAREFEET"] = knniputer.transform(dfp[["SQUAREFEET"]])
dfp["SQUAREFEET"] = dfp["SQUAREFEET"].apply(lambda x: int(x))
dfp['HOA/MONTH'] = dfp['HOA/MONTH'].fillna(0)
# Missingness check after imputation.
total = dfp.isnull().sum().sort_values(ascending=False)
percent_1 = dfp.isnull().sum()/dfp.isnull().count()*100
percent_2 = (round(percent_1, 1)).sort_values(ascending=False)
missing_data = pd.concat([total, percent_2], axis=1, keys=['Total', '%'])
missing_data.head(33)
# Score the for-sale listings with the trained random forest.
x_pred = dfp.loc[:, ['SOLDDATE', 'PROPERTYTYPE', 'CITY', 'ZIPORPOSTALCODE', 'BEDS', 'BATHS', 'LOCATION', 'SQUAREFEET', 'YEARBUILT',
'HOA/MONTH', 'LATITUDE', 'LONGITUDE', 'AggregateCrime']]
# Despite the name, y_pred here is the listing (asking) price, used as the
# "actual" to compare the model's valuation against.
y_pred = dfp["PRICE"]
y_actual_predictions = rf.predict(x_pred)
dff = pd.DataFrame({'ActualPrice': y_pred, 'PredictedPrice': y_actual_predictions})
# Reversing log normalization (left disabled — the log transform was not kept)
#dff['PredictedPrice'] = dff['PredictedPrice'].apply(lambda x: np.exp(x))
#dff['ActualPrice'] = dff['ActualPrice'].apply(lambda x: np.exp(x))
dff=pd.concat([dff,x_pred], axis=1)
# Percentage by which the model's price exceeds (+) or trails (-) the ask.
dff['PERCENTAGE']=round(((dff['PredictedPrice']-dff['ActualPrice'])/dff['ActualPrice'])*100, 2)
dff['PERCENTAGE'].value_counts()
# Bar chart of predicted-vs-ask percentage differences, bucketed.
filter_values = [ -100, 0, 50, 100]
fig = plt.figure()
fig, ax = plt.subplots()
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.spines['left'].set_visible(True)
ax.spines['bottom'].set_visible(False)
feq=dff['PERCENTAGE'].value_counts(bins=filter_values).sort_index()
feq.plot.barh(figsize=(8,8), color="red", width=0.9)
plt.title("Percentage Difference", fontsize=20)
# Annotate each bar with its count.
for i, v in enumerate(feq):
    ax.text(v + 1, i - 0.1, str(v), color="black", fontweight='bold')
ax.set_xticks([])
plt.show()
# "Good buys": the model values the home at least 50% above the asking
# price; everything else is treated as a "bad buy".
badbuys=pd.DataFrame(dff[['LATITUDE', 'LONGITUDE']][dff['PERCENTAGE']<50])
goodbuys=pd.DataFrame(dff[['LATITUDE', 'LONGITUDE']][dff['PERCENTAGE']>=50])
goodbuyslocationlist = goodbuys.values.tolist()
badbuyslocationlist = badbuys.values.tolist()
# Green markers: good buys.
map = folium.Map(location=[41.85, -87.68], zoom_start=12)
for point in range(0, len(goodbuyslocationlist)):
    folium.Marker(goodbuyslocationlist[point], icon=folium.Icon(color='green', icon='home')).add_to(map)
map
# Red markers: bad buys.
map = folium.Map(location=[41.85, -87.68], zoom_start=12)
for point in range(0, len(badbuyslocationlist)):
    folium.Marker(badbuyslocationlist[point], icon=folium.Icon(color='red', icon='home')).add_to(map)
map
# Map the five most expensive sold homes (after re-applying the price fences).
df.dropna(subset = ['PRICE'], inplace=True)
df['PRICE']=df['PRICE'].astype('int')
num_list=[ "PRICE"]
for i in num_list:
    df=df[df[i].between(outlier_lower_iqr(i), outlier_upper_iqr(i))]
highest=pd.DataFrame(df.nlargest(5, 'PRICE'))
costliest=pd.DataFrame(highest[['LATITUDE', 'LONGITUDE']])
costliestlist = costliest.values.tolist()
# Red markers: the five costliest sales.
map = folium.Map(location=[41.85, -87.68], zoom_start=12)
for point in range(0, len(costliestlist)):
    folium.Marker(costliestlist[point], icon=folium.Icon(color='red', icon='home')).add_to(map)
map